# EA count data ----

# WD
# NOTE(review): setwd() with an absolute home path is non-portable;
# consider an RStudio project or here::here() instead.
setwd("~/PhD/London NERC DTP documents/EA data/EnvironmentAgency_EcologyFish_DataDownload_2025-04-02")

# Load data
# Environment Agency TraC (Transitional and Coastal) fish survey extracts:
# per-survey species counts plus a site lookup table.
counts <- read_csv("TraC_Fish_Counts_2025-10-10.csv")
sites <- read_csv("TraC_Fish_Sites_2025-10-10.csv")

# Join count data with site information
# left_join keeps every count row even when a site is missing from the
# lookup; shared column names (e.g. SITE_NAME) receive .x/.y suffixes.
# SEASON splits the year into Apr-Sep ("Summer") vs Oct-Mar ("Winter").
counts_clean <- counts %>% left_join(sites, by = "SITE_ID") %>%
  mutate(EVENT_DATE = ymd(EVENT_DATE),
         EVENT_YEAR = year(EVENT_DATE),
         EVENT_MONTH = month(EVENT_DATE),
         SEASON = if_else(EVENT_MONTH %in% 4:9, "Summer", "Winter"))

# Keep only the columns used downstream. SITE_NAME.x is the counts-side
# name produced by the join suffixing above.
counts_clean <- counts_clean %>%
  select(
    SITE_ID,
    SITE_NAME = SITE_NAME.x,
    SITE_PARENT_NAME,
    SURVEY_ID,
    EVENT_DATE,
    SPECIES_NAME,
    LATIN_NAME,
    FISH_COUNT,
    NO_OF_RUNS,
    TOP_TIER_SITE,
    SITE_RANKED_NGR,
    SITE_RANKED_EASTING,
    SITE_RANKED_NORTHING)

# Extract date components.
# Fix: EVENT_DATE was already parsed with ymd() in the join step above, so
# the redundant re-conversion has been dropped.
# NOTE(review): this adds EVENT_DATE_YEAR alongside the EVENT_YEAR column
# created earlier; downstream code uses EVENT_DATE_YEAR, so both remain.
counts_clean <- counts_clean %>%
  mutate(
    EVENT_DATE_YEAR = year(EVENT_DATE),
    EVENT_MONTH = month(EVENT_DATE),
    EVENT_DAY = day(EVENT_DATE))

# Sampling method: infer gear type from keywords embedded in the site name.
# NOTE(review): relies on the gear being named inside SITE_NAME; confirm the
# source data has no dedicated gear column.
counts_clean <- counts_clean %>%
  mutate(
    .site_lc = str_to_lower(SITE_NAME),  # lower-case once instead of 8 times
    METHOD = case_when(
      str_detect(.site_lc, "seine") ~ "Seine Net",
      str_detect(.site_lc, "beam")  ~ "Beam Trawl",
      str_detect(.site_lc, "otter") ~ "Otter Trawl",
      str_detect(.site_lc, "kick")  ~ "Kick Sample",
      str_detect(.site_lc, "fyke")  ~ "Fyke Net",
      str_detect(.site_lc, "push")  ~ "Push Net",
      str_detect(.site_lc, "hand")  ~ "Hand Collection",
      str_detect(.site_lc, "gill")  ~ "Gill Net",
      TRUE ~ "Unknown"),
    .site_lc = NULL)  # drop the helper column again

# Thames only
# Case-insensitive text match on the top-tier site name.
counts_thames <- counts_clean %>%
  filter(str_detect(TOP_TIER_SITE, regex("thames", ignore_case = TRUE)))

# Lat and long
# Easting/northing are British National Grid (EPSG:27700); reproject to
# WGS84 (EPSG:4326) and copy coordinates into plain lon/lat columns for
# leaflet.
sites_sf <- st_as_sf(counts_thames, coords = c("SITE_RANKED_EASTING", "SITE_RANKED_NORTHING"), crs = 27700) %>%
  st_transform(crs = 4326)
coords <- st_coordinates(sites_sf)
sites_sf$lon <- coords[, 1]
sites_sf$lat <- coords[, 2]

# Gears
# Fix (DRY): METHOD_TYPE was re-derived from SITE_NAME with a case_when
# identical to the METHOD classification above; reuse that column instead.
sites_sf <- sites_sf %>%
  mutate(METHOD_TYPE = METHOD)

##### Map #####
# One colour per inferred gear type.
method_types <- unique(sites_sf$METHOD_TYPE)
method_pal <- colorFactor(palette = "Set2", domain = method_types)

# Interactive leaflet map: one circle per record, coloured by gear, with a
# popup, a legend and per-gear layer toggles.
leaflet(sites_sf) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addCircleMarkers(
    lng = ~lon, lat = ~lat,
    color = ~method_pal(METHOD_TYPE),
    radius = 6,
    stroke = TRUE, weight = 1, fillOpacity = 0.9,
    popup = ~paste0(
      "<b>Site:</b> ", SITE_NAME, "<br>",
      "<b>Zone:</b> ", TOP_TIER_SITE, "<br>",
      "<b>METHOD:</b> ", METHOD_TYPE), group = ~METHOD_TYPE) %>%
  addLegend(
    position = "bottomright",
    pal = method_pal,
    values = ~METHOD_TYPE,
    title = "Method type",
    opacity = 1) %>%
  addLayersControl(
    overlayGroups = method_types,
    options = layersControlOptions(collapsed = FALSE),
    position = "topright") %>%
  addResetMapButton()
# RA
# Totals per gear: individuals summed over all records, richness as
# distinct species names.
method_totals <- counts_thames %>%
  filter(!is.na(METHOD)) %>%
  group_by(METHOD) %>%
  summarise(
    Total_Individuals = sum(FISH_COUNT, na.rm = TRUE),
    Total_Species = n_distinct(SPECIES_NAME),
    .groups = "drop")

# Get grand total
grand_total <- sum(method_totals$Total_Individuals)

# Add relative abundance column
# Percentage of all individuals taken by each gear, sorted descending.
method_totals <- method_totals %>%
  mutate(
    Relative_Abundance = round(100 * Total_Individuals / grand_total, 1)
  ) %>%
  arrange(desc(Relative_Abundance)) %>%
  mutate(METHOD = factor(METHOD, levels = unique(METHOD)))

##### RA #####
ggplot(method_totals, aes(x = reorder(METHOD, -Relative_Abundance), y = Relative_Abundance)) +
  geom_col(fill = "tomato") +
  labs(
    title = "Relative abundance by sampling method",
    x = "Sampling method",
    y = "Relative abundance (%)") +
  theme_minimal()

# Matrix
# Species x method count table, then a presence/absence (0/1) version.
method_species_table <- counts_thames %>%
  filter(!is.na(METHOD), !is.na(SPECIES_NAME)) %>%
  count(METHOD, SPECIES_NAME) %>%
  pivot_wider(names_from = METHOD, values_from = n, values_fill = 0)

method_pa <- method_species_table %>%
  mutate(across(-SPECIES_NAME, ~ as.integer(. > 0)))

# All gear columns (everything except the species name).
method_cols <- names(method_pa)[!names(method_pa) %in% "SPECIES_NAME"]

# Species recorded by exactly one method.
# Vectorised with rowSums()/max.col() instead of the original rowwise() +
# c_across() loop: identical result, much faster on wide tables. With
# exactly one 1 per kept row, max.col() has a unique maximum, so its
# random tie-breaking never triggers.
pa_mat <- as.matrix(method_pa[method_cols])
one_method <- rowSums(pa_mat) == 1
method_unique <- method_pa[one_method, , drop = FALSE] %>%
  mutate(unique_to = method_cols[max.col(pa_mat[one_method, , drop = FALSE])])

# Number of method-exclusive species per gear, most first.
unique_counts <- method_unique %>%
  count(unique_to, name = "unique_species_count") %>%
  arrange(desc(unique_species_count))

# Bar chart: how many species are caught only by each gear.
ggplot(unique_counts, aes(x = reorder(unique_to, unique_species_count), y = unique_species_count, fill = unique_to)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(
    title = "Number of species unique to each method type",
    x = "Sampling method", y = "Unique species count") +
  theme_minimal()

# Long-format presence/absence for the heatmap below.
method_pa_long <- method_pa %>%
  pivot_longer(cols = -SPECIES_NAME, names_to = "METHOD", values_to = "present")

# Presence/absence heatmap: species (rows) x gear (columns).
ggplot(method_pa_long, aes(x = METHOD, y = SPECIES_NAME, fill = as.factor(present))) +
  geom_tile(color = "white") +
  scale_fill_manual(values = c("0" = "white", "1" = "steelblue")) +
  labs(
    title = "Species presence across sampling method",
    x = "Sampling method", y = "Species",
    fill = "Present") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 6))

# Seine net data only ----

# 1) Filter to Seine Net
counts_thames_filtered <- counts_thames %>%
  filter(METHOD == "Seine Net")

# 2) Per-survey CPUE (fish per run)
#    - If NO_OF_RUNS can vary within a survey, switch `first()` to `max()` and sanity-check.
#    CPUE = total fish in the survey / number of seine runs, giving one
#    effort-standardised value per survey.
seine_cpue <- counts_thames_filtered %>%
  filter(!is.na(FISH_COUNT), NO_OF_RUNS > 0) %>%
  group_by(SURVEY_ID, EVENT_DATE_YEAR) %>%
  summarise(
    total_fish = sum(FISH_COUNT, na.rm = TRUE),
    runs       = dplyr::first(NO_OF_RUNS),   # or max(NO_OF_RUNS)
    n_checks   = n_distinct(NO_OF_RUNS),     # for sanity checking
    .groups    = "drop"
  ) %>%
  mutate(CPUE = total_fish / runs)

# Optional sanity check:
# stopifnot(all(seine_cpue$n_checks == 1))

# 3) Yearly, survey-balanced summary (each survey = 1 vote)
cpue_year <- seine_cpue %>%
  group_by(EVENT_DATE_YEAR) %>%
  summarise(
    mean_CPUE = mean(CPUE, na.rm = TRUE),
    sd_CPUE   = sd(CPUE, na.rm = TRUE),
    n_surveys = n(),
    se        = sd_CPUE / sqrt(n_surveys),
    .groups   = "drop"
  )

# 4) Quick overview plot (mean ± SE)
p1 <- ggplot(cpue_year, aes(x = EVENT_DATE_YEAR, y = mean_CPUE)) +
  geom_ribbon(aes(ymin = mean_CPUE - se, ymax = mean_CPUE + se),
              alpha = 0.2, fill = "steelblue") +
  geom_line(colour = "steelblue", linewidth = 1) +
  geom_point(colour = "steelblue", size = 2) +
  labs(
    title = "Mean seine net CPUE per year – Thames Estuary (survey-balanced)",
    x = "Year",
    y = "Mean CPUE (fish per run)"
  ) +
  theme_minimal()

# 5) Trend model on yearly means
# Simple linear regression of mean CPUE on year; fitted values and their
# standard errors feed the CI ribbon in p2.
lm1 <- lm(mean_CPUE ~ EVENT_DATE_YEAR, data = cpue_year)

cpue_year$pred    <- predict(lm1)
cpue_year$pred_se <- predict(lm1, se.fit = TRUE)$se.fit

s      <- summary(lm1)
slope  <- coef(s)["EVENT_DATE_YEAR","Estimate"]
pval   <- coef(s)["EVENT_DATE_YEAR","Pr(>|t|)"]
lab    <- sprintf("LM slope = %.2f fish/run/yr (p = %.3f)", slope, pval)

# 6) Trend plot with fitted mean and 95% CI of the fit
p2 <- ggplot(cpue_year, aes(EVENT_DATE_YEAR, mean_CPUE)) +
  geom_point(size = 2) +
  geom_line(col = "grey40") +
  geom_ribbon(aes(ymin = pred - 1.96 * pred_se, ymax = pred + 1.96 * pred_se),
              fill = "lightblue", alpha = 0.35) +
  geom_line(aes(y = pred), col = "blue", linewidth = 1) +
  labs(
    title = "Trend in mean seine net CPUE – Thames Estuary (survey-balanced)",
    x = "Year", y = "Mean CPUE (fish per run)"
  ) +
  annotate("text",
           x = min(cpue_year$EVENT_DATE_YEAR, na.rm = TRUE) + 1,
           y = max(cpue_year$mean_CPUE, na.rm = TRUE),
           hjust = 0, vjust = 1, size = 3.5, label = lab) +
  theme_minimal()

p1

p2

# --- per-survey CPUE per species ---
# Same CPUE standardisation as above, split by species within each survey.
species_svy_cpue <- counts_thames_filtered %>%
  filter(NO_OF_RUNS > 0) %>%
  group_by(SURVEY_ID, EVENT_DATE_YEAR, SPECIES_NAME) %>%
  summarise(
    count = sum(FISH_COUNT, na.rm = TRUE),
    runs  = dplyr::first(NO_OF_RUNS),
    .groups = "drop"
  ) %>%
  mutate(CPUE = count / runs)

# --- yearly, survey-balanced CPUE per species ---
# n_surveys counts only surveys where the species occurred; it is used as
# a regression weight later.
species_trends <- species_svy_cpue %>%
  group_by(EVENT_DATE_YEAR, SPECIES_NAME) %>%
  summarise(
    mean_CPUE = mean(CPUE, na.rm = TRUE),
    n_surveys = n(),
    .groups = "drop"
  )

# --- per-species trend (weights = n_surveys) ---
# Weighted linear trend of mean CPUE on year for one species' data.
# Returns a one-row tibble with slope, Wald 95% CI, p-value and stars.
# Returns all-NA when fewer than 3 distinct years or a constant response
# would make the fit degenerate.
species_model_summary <- function(df) {
  if (dplyr::n_distinct(df$EVENT_DATE_YEAR) < 3 || sd(df$mean_CPUE, na.rm = TRUE) == 0) {
    return(tibble(Slope = NA_real_, CI_low = NA_real_, CI_high = NA_real_,
                  P_value = NA_real_, Significance = NA_character_))
  }
  # Years supported by more surveys get more weight in the fit.
  m <- lm(mean_CPUE ~ EVENT_DATE_YEAR, data = df, weights = n_surveys)
  s <- summary(m)
  slope <- coef(s)["EVENT_DATE_YEAR", "Estimate"]
  se    <- coef(s)["EVENT_DATE_YEAR", "Std. Error"]
  p     <- coef(s)["EVENT_DATE_YEAR", "Pr(>|t|)"]
  # p is a single number, so a scalar if/else chain replaces the
  # vectorised dplyr::case_when; thresholds unchanged, NA p -> no stars.
  sig <- if (is.na(p)) {
    ""
  } else if (p <= 0.001) {
    "***"
  } else if (p <= 0.01) {
    "**"
  } else if (p <= 0.05) {
    "*"
  } else {
    ""
  }
  tibble(
    Slope   = slope,
    CI_low  = slope - 1.96 * se,
    CI_high = slope + 1.96 * se,
    P_value = p,
    Significance = sig
  )
}

# Fit the per-species trend model to every species; group_modify() binds
# the one-row summaries back with SPECIES_NAME attached.
species_results <- species_trends %>%
  group_by(SPECIES_NAME) %>%
  group_modify(~ species_model_summary(.x)) %>%
  ungroup() %>%
  mutate(
    SigCat = factor(Significance,
                    levels = c("***","**","*",""),
                    labels = c("***","**","*","ns"))
  )

# Keep only significant trends (filter() also drops NA p-values from
# species that could not be modelled).
species_results_sig <- species_results %>%
  filter(P_value <= 0.05) %>%
  mutate(
    SigCat = factor(Significance,
                    levels = c("***","**","*"),
                    labels = c("***","**","*"))
  )

# --- Define colour palette for significance ---
sig_cols <- c(
  "***" = "#3b82f6",  # blue
  "**"  = "#22c55e",  # green
  "*"   = "#ef4444"   # red
)

# --- Plot only significant species ---
# Forest-style plot: slope with 95% CI per species, ordered by slope.
ggplot(species_results_sig, aes(x = Slope, y = reorder(SPECIES_NAME, Slope))) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray70") +
  geom_errorbarh(aes(xmin = CI_low, xmax = CI_high), height = 0.3, color = "gray70") +
  geom_point(aes(color = SigCat), size = 3) +
  scale_color_manual(values = sig_cols, drop = FALSE, name = "Significance",
                     breaks = c("***","**","*")) +
  labs(
    title = "Significant species-level trends in abundance (CPUE) – Thames Estuary",
    subtitle = "Only species with p ≤ 0.05 shown. Slope ± 95% CI (weighted by number of surveys).",
    x = "Slope (Δ CPUE per year)", 
    y = "Species", 
    caption = "Significance: * p ≤ 0.05 | ** p ≤ 0.01 | *** p ≤ 0.001"
  ) +
  theme_minimal() +
  theme(
    legend.position = "top",
    legend.direction = "horizontal",
    legend.title = element_text(size = 11),
    legend.text = element_text(size = 10)
  )

# Per-survey species composition.
# .groups = "drop_last" leaves the result grouped by year + survey, so the
# summarise() below computes one diversity value per survey.
svy_comp <- counts_thames_filtered %>%
  group_by(EVENT_DATE_YEAR, SURVEY_ID, SPECIES_NAME) %>%
  summarise(n = sum(FISH_COUNT, na.rm = TRUE), .groups = "drop_last")

# Per-survey diversity metrics.
div_svy <- svy_comp %>%
  summarise(
    Species_Richness  = n_distinct(SPECIES_NAME),
    Total_Abundance   = sum(n),
    Shannon_Diversity = vegan::diversity(n, index = "shannon"),
    Simpson_Diversity = vegan::diversity(n, index = "simpson"),
    .groups = "drop"
  ) %>%
  mutate(
    # Fix: Margalef is undefined for total abundance <= 1 — the previous
    # pmax(Total_Abundance, 1) guard gave log(1) = 0 and hence 0/0 (NaN)
    # or x/0 (Inf). Report NA for those surveys instead.
    Margalef_Richness = if_else(Total_Abundance > 1,
                                (Species_Richness - 1) / log(Total_Abundance),
                                NA_real_),
    # log(pmax(richness, 2)) avoids log(1) = 0 when only one species occurs.
    Pielou_Evenness   = Shannon_Diversity / log(pmax(Species_Richness, 2))
  )

# Yearly means ± SE of each per-survey diversity metric (survey-balanced:
# every survey counts equally within its year).
diversity_summary <- div_svy %>%
  group_by(EVENT_DATE_YEAR) %>%
  summarise(
    Richness  = mean(Species_Richness),  se_Richness  = sd(Species_Richness)/sqrt(n()),
    Shannon   = mean(Shannon_Diversity), se_Shannon   = sd(Shannon_Diversity)/sqrt(n()),
    Simpson   = mean(Simpson_Diversity), se_Simpson   = sd(Simpson_Diversity)/sqrt(n()),
    Margalef  = mean(Margalef_Richness), se_Margalef  = sd(Margalef_Richness)/sqrt(n()),
    Pielou    = mean(Pielou_Evenness),   se_Pielou    = sd(Pielou_Evenness)/sqrt(n()),
    n_surveys      = n(),
    .groups = "drop"
  )

# Long format: one row per (year, metric) for faceting.
diversity_long <- diversity_summary %>%
  select(EVENT_DATE_YEAR,
         Richness, Shannon, Simpson, Margalef, Pielou) %>%
  pivot_longer(cols = -EVENT_DATE_YEAR,
               names_to = "Metric", values_to = "Value")

# Faceted time series with a LOESS smoother per metric.
ggplot(diversity_long, aes(x = EVENT_DATE_YEAR, y = Value)) +
  geom_line(color = "red", linewidth = 1) +
  geom_smooth(method = "loess", se = TRUE, color = "black", linewidth = 1) +
  facet_wrap(~ Metric, scales = "free_y") +
  labs(
    title = "Diversity metrics over time – Thames Estuary (survey-balanced)",
    subtitle = "Per-survey diversity averaged per year (LOESS trend)",
    x = "Year", y = "Diversity metric (mean)"
  ) +
  theme_minimal()

# One linear trend model per diversity metric (yearly means as response).
lm_richness <- lm(Richness  ~ EVENT_DATE_YEAR, data = diversity_summary)
lm_shannon  <- lm(Shannon   ~ EVENT_DATE_YEAR, data = diversity_summary)
lm_simpson  <- lm(Simpson   ~ EVENT_DATE_YEAR, data = diversity_summary)
lm_margalef <- lm(Margalef  ~ EVENT_DATE_YEAR, data = diversity_summary)
lm_evenness <- lm(Pielou    ~ EVENT_DATE_YEAR, data = diversity_summary)

# Package the EVENT_DATE_YEAR slope of a fitted trend model with a Wald
# 95% CI, p-value (rounded to 4 dp) and significance stars.
#
# model       : fitted lm/glm whose predictor is named EVENT_DATE_YEAR
# metric_name : label for the Metric column of the returned data.frame
#
# Handles both t-test ("Pr(>|t|)") and z-test ("Pr(>|z|)") summaries.
summary_table <- function(model, metric_name) {
  coef_summary <- summary(model)$coefficients
  slope <- coef_summary["EVENT_DATE_YEAR", "Estimate"]
  se <- coef_summary["EVENT_DATE_YEAR", "Std. Error"]
  p_col <- intersect(c("Pr(>|t|)", "Pr(>|z|)"), colnames(coef_summary))[1]
  p <- coef_summary["EVENT_DATE_YEAR", p_col]
  ci_low <- slope - 1.96 * se
  ci_high <- slope + 1.96 * se
  # p is scalar: a plain if/else chain replaces the vectorised case_when
  # (also removes this helper's dplyr dependency); NA p yields no stars.
  sig <- if (is.na(p)) {
    ""
  } else if (p <= 0.001) {
    "***"
  } else if (p <= 0.01) {
    "**"
  } else if (p <= 0.05) {
    "*"
  } else {
    ""
  }
  data.frame(Metric = metric_name, Slope = slope, SE = se,
             CI_low = ci_low, CI_high = ci_high,
             P_value = round(p, 4), Significance = sig)
}

# Collect the five trend models into one table for plotting.
results <- bind_rows(
  summary_table(lm_richness,  "Species richness"),
  summary_table(lm_shannon,   "Shannon diversity"),
  summary_table(lm_simpson,   "Simpson diversity"),
  summary_table(lm_margalef,  "Margalef richness"),
  summary_table(lm_evenness,  "Pielou evenness")
)

# Axis labels get significance stars appended (except "Species richness",
# deliberately unstarred, and non-significant metrics).
results_lab <- results %>%
  mutate(
    SigCat = factor(Significance,
                    levels = c("***","**","*",""),
                    labels = c("***","**","*","ns")),
    Metric_lab = dplyr::case_when(
      Metric == "Species richness" ~ Metric,                 # never add stars here
      Significance == ""           ~ Metric,                 # ns → no stars
      TRUE                         ~ paste0(Metric, " ", Significance)
    )
  )

# Significance palette (redefines the earlier sig_cols, now with "ns").
sig_cols <- c(
  "***" = "#3b82f6",  # blue
  "**"  = "#22c55e",  # green
  "*"   = "#ef4444",  # red
  "ns"  = "#9ca3af"   # grey
)

# Forest plot of metric trends: slope ± 95% CI, coloured by significance.
ggplot(results_lab, aes(x = Slope, y = reorder(Metric_lab, Slope))) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray70") +
  geom_errorbarh(aes(xmin = CI_low, xmax = CI_high), height = 0.22, color = "gray70") +
  geom_point(aes(color = SigCat), size = 3) +
  scale_color_manual(values = sig_cols, drop = FALSE, name = "Significance",
                     breaks = c("***","**","*","ns")) +
  labs(
    title = "Trend in diversity metrics over time – Thames Estuary",
    subtitle = "Slope ± 95% CI from linear models on yearly survey-balanced means",
    x = "Slope (change per year)", y = "Diversity metric (mean)",
    caption = "Significance: * p ≤ 0.05 | ** p ≤ 0.01 | *** p ≤ 0.001"
  ) +
  theme_minimal() +
  theme(legend.position = "top",
        legend.direction = "horizontal",
        legend.title = element_text(size = 11),
        legend.text  = element_text(size = 10))

# Presence/absence per survey and species (1 if the species appeared at
# all in that survey).
pa_svy <- counts_thames_filtered %>%
  filter(!is.na(SPECIES_NAME), !is.na(FISH_COUNT)) %>%
  mutate(PA = as.integer(FISH_COUNT > 0)) %>%
  group_by(EVENT_DATE_YEAR, SURVEY_ID, SPECIES_NAME) %>%
  summarise(present = as.integer(any(PA == 1)), .groups = "drop")

# Number of surveys per year (sampling effort).
svy_per_year <- pa_svy %>%
  distinct(EVENT_DATE_YEAR, SURVEY_ID) %>%
  count(EVENT_DATE_YEAR, name = "n_surveys")

min_svy <- min(svy_per_year$n_surveys)

# Rarefaction: randomly keep min_svy surveys per year so that every year
# has equal effort; seed fixed for reproducibility.
set.seed(42)
sampled_surveys <- pa_svy %>%
  distinct(EVENT_DATE_YEAR, SURVEY_ID) %>%
  group_by(EVENT_DATE_YEAR) %>%
  slice_sample(n = min_svy) %>%
  ungroup()

# Year x species presence/absence matrix (rows = years) built from the
# rarefied survey set, in the shape betapart expects.
year_species_pa <- pa_svy %>%
  semi_join(sampled_surveys, by = c("EVENT_DATE_YEAR","SURVEY_ID")) %>%
  group_by(EVENT_DATE_YEAR, SPECIES_NAME) %>%
  summarise(present = as.integer(any(present == 1)), .groups = "drop") %>%
  tidyr::pivot_wider(names_from = SPECIES_NAME, values_from = present, values_fill = 0) %>%
  tibble::column_to_rownames("EVENT_DATE_YEAR") %>%
  as.data.frame()

# Sorensen-family pairwise beta diversity: total (sor), turnover (sim)
# and nestedness (sne) components.
beta_core <- betapart::betapart.core(year_species_pa)
beta_res  <- betapart::beta.pair(beta_core, index.family = "sor")

sor_mat <- as.matrix(beta_res$beta.sor)
sim_mat <- as.matrix(beta_res$beta.sim)
sne_mat <- as.matrix(beta_res$beta.sne)

years <- sort(as.numeric(rownames(sor_mat)))

# Consecutive-year dissimilarities (year t vs year t+1), looked up from
# the distance matrices by row/column name.
beta_trends <- tibble::tibble(
  Year1 = years[-length(years)],
  Year2 = years[-1],
  beta_sor = purrr::map2_dbl(Year1, Year2, ~ sor_mat[as.character(.x), as.character(.y)]),
  beta_sim = purrr::map2_dbl(Year1, Year2, ~ sim_mat[as.character(.x), as.character(.y)]),
  beta_sne = purrr::map2_dbl(Year1, Year2, ~ sne_mat[as.character(.x), as.character(.y)])
)

# Linear trends of each component against the later year of the pair.
lm_sor <- lm(beta_sor ~ Year2, data = beta_trends)
lm_sim <- lm(beta_sim ~ Year2, data = beta_trends)
lm_sne <- lm(beta_sne ~ Year2, data = beta_trends)

# Extract rounded fit statistics from a simple linear model: R^2 and the
# p-value of the slope (second coefficient row).
get_lm_stats <- function(model) {
  fit <- summary(model)
  slope_p <- fit$coefficients[2, 4]
  list(
    r2 = round(fit$r.squared, 3),
    p  = round(slope_p, 3)
  )
}

# R² and p for each beta component, shown in the plot subtitle below.
sor_stats <- get_lm_stats(lm_sor)
sim_stats <- get_lm_stats(lm_sim)
sne_stats <- get_lm_stats(lm_sne)

# Long format: one row per (year pair, component) for a single ggplot.
beta_trends_long <- beta_trends %>%
  tidyr::pivot_longer(c(beta_sor, beta_sim, beta_sne),
                      names_to = "Component", values_to = "Dissimilarity") %>%
  dplyr::mutate(Component = dplyr::recode(Component,
                                          beta_sor = "Sorensen", beta_sim = "Turnover", beta_sne = "Nestedness"))

subtitle_text <- paste0(
  "Sorensen: R² = ", sor_stats$r2, ", p = ", sor_stats$p, "   |   ",
  "Turnover: R² = ", sim_stats$r2, ", p = ", sim_stats$p, "   |   ",
  "Nestedness: R² = ", sne_stats$r2, ", p = ", sne_stats$p)


# Year-to-year beta diversity components, one line per component.
# Fix: geom_line(size = ...) is deprecated since ggplot2 3.4 — use
# linewidth, consistent with the rest of this script.
ggplot(beta_trends_long, aes(x = Year2, y = Dissimilarity, color = Component)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  scale_color_manual(values = c("Sorensen" = "darkorchid",
                                "Turnover" = "steelblue",
                                "Nestedness" = "darkorange")) +
  labs(
    title = "Year-to-year change in beta diversity components – Thames Estuary (survey-balanced)",
    subtitle = subtitle_text,
    x = "Year", y = "Dissimilarity", color = "Component"
  ) +
  theme_minimal()

# EA length data ----

# --- Setup & load ---
# NOTE(review): absolute setwd() is non-portable; consider here::here().
setwd("~/PhD/London NERC DTP documents/EA data/EnvironmentAgency_EcologyFish_DataDownload_2025-04-02")

# Individual fish length records plus the same site lookup used above.
lengths <- read_csv("TraC_Fish_Individual_Lengths_2025-10-10.csv")
sites   <- read_csv("TraC_Fish_Sites_2025-10-10.csv")

# Join + tidy
# Attach site metadata to each measured fish, keep the analysis columns,
# parse the event date, and classify sampling gear from keywords in the
# site name (same heuristic as the counts section).
lengths_clean <- lengths %>%
  left_join(sites, by = "SITE_ID") %>%
  select(
    SITE_ID,
    SITE_NAME = SITE_NAME.x,  # lengths-side name from the join suffixing
    SITE_PARENT_NAME,
    EVENT_DATE,
    SURVEY_ID,
    SPECIES_NAME,
    LATIN_NAME,
    FISH_LENGTH,
    TOP_TIER_SITE,
    SITE_RANKED_NGR,
    SITE_RANKED_EASTING,
    SITE_RANKED_NORTHING
  ) %>%
  mutate(
    EVENT_DATE = ymd(EVENT_DATE),
    EVENT_YEAR = year(EVENT_DATE),
    EVENT_MONTH = month(EVENT_DATE),
    EVENT_DAY = day(EVENT_DATE),
    .site_lc = str_to_lower(SITE_NAME),  # lower-case once instead of 8 times
    METHOD = case_when(
      str_detect(.site_lc, "seine") ~ "Seine Net",
      str_detect(.site_lc, "beam")  ~ "Beam Trawl",
      str_detect(.site_lc, "otter") ~ "Otter Trawl",
      str_detect(.site_lc, "kick")  ~ "Kick Sample",
      str_detect(.site_lc, "fyke")  ~ "Fyke Net",
      str_detect(.site_lc, "push")  ~ "Push Net",
      str_detect(.site_lc, "hand")  ~ "Hand Collection",
      str_detect(.site_lc, "gill")  ~ "Gill Net",
      TRUE ~ "Unknown"
    ),
    .site_lc = NULL  # drop the helper column again
  )

# Thames + Seine only
# Keep Thames estuary seine-net records with a usable (positive) length.
lengths_thames_filtered <- lengths_clean %>%
  filter(str_detect(TOP_TIER_SITE, regex("thames", ignore_case = TRUE)),
         METHOD == "Seine Net",
         !is.na(FISH_LENGTH), FISH_LENGTH > 0)

# (Optional) Species-wise IQR outlier removal
# Tukey fences per species: drop lengths outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
lengths_thames_filtered <- lengths_thames_filtered %>%
  group_by(LATIN_NAME) %>%
  mutate(
    Q1 = quantile(FISH_LENGTH, 0.25, na.rm = TRUE),
    Q3 = quantile(FISH_LENGTH, 0.75, na.rm = TRUE),
    IQRv = Q3 - Q1,
    lo = Q1 - 1.5 * IQRv,
    hi = Q3 + 1.5 * IQRv
  ) %>%
  ungroup() %>%
  filter(FISH_LENGTH >= lo, FISH_LENGTH <= hi)
# Per-survey summaries (one row per survey)
survey_means <- lengths_thames_filtered %>%
  group_by(EVENT_YEAR, SURVEY_ID) %>%
  summarise(
    mean_len_survey   = mean(FISH_LENGTH),
    median_len_survey = median(FISH_LENGTH),
    n_fish            = n(),
    .groups = "drop"
  )

# Yearly (survey-balanced) summaries
survey_bal <- survey_means %>%
  group_by(EVENT_YEAR) %>%
  summarise(
    mean_length   = mean(mean_len_survey),      # equal weight per survey
    median_length = median(mean_len_survey),
    sd_mean       = sd(mean_len_survey),
    n_surveys     = n(),
    se_mean       = sd_mean / sqrt(n_surveys),
    .groups = "drop"
  ) %>%
  arrange(EVENT_YEAR)

# Linear trends on survey-balanced means
lm_mean   <- lm(mean_length   ~ EVENT_YEAR, data = survey_bal)
lm_median <- lm(median_length ~ EVENT_YEAR, data = survey_bal)

# Pull the EVENT_YEAR slope and its p-value out of a fitted lm summary.
get_coef <- function(mod) {
  coefs <- summary(mod)$coefficients
  list(
    slope = unname(coefs["EVENT_YEAR", "Estimate"]),
    p     = unname(coefs["EVENT_YEAR", "Pr(>|t|)"])
  )
}
# Annotation labels with slope and p-value for each statistic.
lab_mean   <- with(get_coef(lm_mean),   sprintf("Mean: slope = %.2f mm/yr (p = %.3g)",   slope, p))
lab_median <- with(get_coef(lm_median), sprintf("Median: slope = %.2f mm/yr (p = %.3g)", slope, p))

# Plot with top annotations
# Mean (solid, SE ribbon) and median (dashed) length per year plus linear
# trend lines; labels pinned to the top-left via y = Inf + vjust.
x_left <- min(survey_bal$EVENT_YEAR, na.rm = TRUE)
ggplot(survey_bal, aes(EVENT_YEAR)) +
  geom_ribbon(aes(ymin = mean_length - se_mean, ymax = mean_length + se_mean),
              alpha = 0.15, fill = "steelblue") +
  geom_line(aes(y = mean_length, color = "Mean"), linewidth = 1.2) +
  geom_point(aes(y = mean_length, color = "Mean"), size = 2) +
  geom_smooth(aes(y = mean_length, color = "Mean"), method = "lm", se = FALSE, linewidth = 0.9) +
  geom_line(aes(y = median_length, color = "Median"), linewidth = 1.2, linetype = "dashed") +
  geom_point(aes(y = median_length, color = "Median"), size = 2, shape = 17) +
  geom_smooth(aes(y = median_length, color = "Median"), method = "lm", se = FALSE, linetype = "dashed", linewidth = 0.9) +
  annotate("text", x = x_left, y = Inf, hjust = 0, vjust = 1.4, label = lab_mean,   color = "steelblue",  size = 4) +
  annotate("text", x = x_left, y = Inf, hjust = 0, vjust = 2.8, label = lab_median, color = "darkorange", size = 4) +
  scale_y_continuous(expand = expansion(mult = c(0.02, 0.18))) +
  scale_color_manual(values = c("Mean" = "steelblue", "Median" = "darkorange")) +
  labs(title = "Survey-balanced fish length trends – Thames Estuary",
       subtitle = "Mean ± SE (solid) and median (dashed) per year with linear trends",
       x = "Year", y = "Fish length (mm)", color = "Statistic") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "top", panel.grid.minor = element_blank())

# EA subsites ----

# Seven most frequently sampled sites.

# WD
# NOTE(review): this section re-runs the count-data preparation from the
# top of the file before restricting to seven subsites; consider factoring
# the shared steps into functions to remove the duplication.
setwd("~/PhD/London NERC DTP documents/EA data/EnvironmentAgency_EcologyFish_DataDownload_2025-04-02")

# Load data
counts <- read_csv("TraC_Fish_Counts_2025-10-10.csv")
sites <- read_csv("TraC_Fish_Sites_2025-10-10.csv")

# Join count data with site information
# Same join/derivation as the first section: Date parsing, year/month, and
# Apr-Sep "Summer" vs Oct-Mar "Winter" season split.
counts_clean <- counts %>% left_join(sites, by = "SITE_ID") %>%
  mutate(EVENT_DATE = ymd(EVENT_DATE),
         EVENT_YEAR = year(EVENT_DATE),
         EVENT_MONTH = month(EVENT_DATE),
         SEASON = if_else(EVENT_MONTH %in% 4:9, "Summer", "Winter"))

# Keep the analysis columns; SITE_NAME.x comes from the join suffixing.
counts_clean <- counts_clean %>%
  select(
    SITE_ID,
    SITE_NAME = SITE_NAME.x,
    SITE_PARENT_NAME,
    SURVEY_ID,
    EVENT_DATE,
    SPECIES_NAME,
    LATIN_NAME,
    FISH_COUNT,
    NO_OF_RUNS,
    TOP_TIER_SITE,
    SITE_RANKED_NGR,
    SITE_RANKED_EASTING,
    SITE_RANKED_NORTHING)

# Extract date components.
# Fix: EVENT_DATE was already parsed with ymd() in the join step above, so
# the redundant re-conversion has been dropped (as in the first section).
counts_clean <- counts_clean %>%
  mutate(
    EVENT_DATE_YEAR = year(EVENT_DATE),
    EVENT_MONTH = month(EVENT_DATE),
    EVENT_DAY = day(EVENT_DATE))

# Sampling method: infer gear type from keywords in the site name
# (lower-case computed once instead of once per pattern).
counts_clean <- counts_clean %>%
  mutate(
    .site_lc = str_to_lower(SITE_NAME),
    METHOD = case_when(
      str_detect(.site_lc, "seine") ~ "Seine Net",
      str_detect(.site_lc, "beam")  ~ "Beam Trawl",
      str_detect(.site_lc, "otter") ~ "Otter Trawl",
      str_detect(.site_lc, "kick")  ~ "Kick Sample",
      str_detect(.site_lc, "fyke")  ~ "Fyke Net",
      str_detect(.site_lc, "push")  ~ "Push Net",
      str_detect(.site_lc, "hand")  ~ "Hand Collection",
      str_detect(.site_lc, "gill")  ~ "Gill Net",
      TRUE ~ "Unknown"),
    .site_lc = NULL)

# Thames only
counts_thames <- counts_clean %>%
  filter(str_detect(TOP_TIER_SITE, regex("thames", ignore_case = TRUE)))

# Restrict to the seven most frequently sampled subsites.
counts_thames <- counts_thames %>%
  filter(SITE_PARENT_NAME %in% c(
    "Richmond",
    "Kew",
    "Battersea",
    "Greenwich",
    "West Thurrock",
    "Denton Wharf",
    "Stanford-le-Hope Beach"
  ))

# Lat and long
# British National Grid (EPSG:27700) -> WGS84 (EPSG:4326), as before.
sites_sf <- st_as_sf(counts_thames, coords = c("SITE_RANKED_EASTING", "SITE_RANKED_NORTHING"), crs = 27700) %>%
  st_transform(crs = 4326)
coords <- st_coordinates(sites_sf)
sites_sf$lon <- coords[, 1]
sites_sf$lat <- coords[, 2]

# Gears
# Fix (DRY): METHOD_TYPE was re-derived from SITE_NAME with a case_when
# identical to the METHOD classification above; reuse that column instead.
sites_sf <- sites_sf %>%
  mutate(METHOD_TYPE = METHOD)

##### Map #####
# Duplicate of the gear map / relative-abundance / species-matrix code
# from the first section, now on the subsite-restricted data.
method_types <- unique(sites_sf$METHOD_TYPE)
method_pal <- colorFactor(palette = "Set2", domain = method_types)

leaflet(sites_sf) %>%
  addProviderTiles("CartoDB.Positron") %>%
  addCircleMarkers(
    lng = ~lon, lat = ~lat,
    color = ~method_pal(METHOD_TYPE),
    radius = 6,
    stroke = TRUE, weight = 1, fillOpacity = 0.9,
    popup = ~paste0(
      "<b>Site:</b> ", SITE_NAME, "<br>",
      "<b>Zone:</b> ", TOP_TIER_SITE, "<br>",
      "<b>METHOD:</b> ", METHOD_TYPE), group = ~METHOD_TYPE) %>%
  addLegend(
    position = "bottomright",
    pal = method_pal,
    values = ~METHOD_TYPE,
    title = "Method type",
    opacity = 1) %>%
  addLayersControl(
    overlayGroups = method_types,
    options = layersControlOptions(collapsed = FALSE),
    position = "topright") %>%
  addResetMapButton()
# RA
# Totals per gear on the subsite data.
method_totals <- counts_thames %>%
  filter(!is.na(METHOD)) %>%
  group_by(METHOD) %>%
  summarise(
    Total_Individuals = sum(FISH_COUNT, na.rm = TRUE),
    Total_Species = n_distinct(SPECIES_NAME),
    .groups = "drop")

# Get grand total
grand_total <- sum(method_totals$Total_Individuals)

# Add relative abundance column
method_totals <- method_totals %>%
  mutate(
    Relative_Abundance = round(100 * Total_Individuals / grand_total, 1)
  ) %>%
  arrange(desc(Relative_Abundance)) %>%
  mutate(METHOD = factor(METHOD, levels = unique(METHOD)))

##### RA #####
ggplot(method_totals, aes(x = reorder(METHOD, -Relative_Abundance), y = Relative_Abundance)) +
  geom_col(fill = "tomato") +
  labs(
    title = "Relative abundance by sampling method",
    x = "Sampling method",
    y = "Relative abundance (%)") +
  theme_minimal()

# Matrix
# Species x method counts, then a presence/absence (0/1) version.
method_species_table <- counts_thames %>%
  filter(!is.na(METHOD), !is.na(SPECIES_NAME)) %>%
  count(METHOD, SPECIES_NAME) %>%
  pivot_wider(names_from = METHOD, values_from = n, values_fill = 0)

method_pa <- method_species_table %>%
  mutate(across(-SPECIES_NAME, ~ as.integer(. > 0)))

# All gear columns (everything except the species name).
method_cols <- names(method_pa)[!names(method_pa) %in% "SPECIES_NAME"]

# Species recorded by exactly one method.
# Vectorised with rowSums()/max.col() instead of the original rowwise() +
# c_across() loop: identical result, much faster. With exactly one 1 per
# kept row, max.col() has a unique maximum so tie-breaking never triggers.
pa_mat <- as.matrix(method_pa[method_cols])
one_method <- rowSums(pa_mat) == 1
method_unique <- method_pa[one_method, , drop = FALSE] %>%
  mutate(unique_to = method_cols[max.col(pa_mat[one_method, , drop = FALSE])])

# Number of method-exclusive species per gear, most first.
unique_counts <- method_unique %>%
  count(unique_to, name = "unique_species_count") %>%
  arrange(desc(unique_species_count))

# Bar chart: how many species are caught only by each gear (subsite data).
ggplot(unique_counts, aes(x = reorder(unique_to, unique_species_count), y = unique_species_count, fill = unique_to)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(
    title = "Number of species unique to each method type",
    x = "Sampling method", y = "Unique species count") +
  theme_minimal()

# Long-format presence/absence for the heatmap below.
method_pa_long <- method_pa %>%
  pivot_longer(cols = -SPECIES_NAME, names_to = "METHOD", values_to = "present")

# Presence/absence heatmap: species (rows) x gear (columns).
ggplot(method_pa_long, aes(x = METHOD, y = SPECIES_NAME, fill = as.factor(present))) +
  geom_tile(color = "white") +
  scale_fill_manual(values = c("0" = "white", "1" = "steelblue")) +
  labs(
    title = "Species presence across sampling method",
    x = "Sampling method", y = "Species",
    fill = "Present") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 6))

# 1) Filter to Seine Net
counts_thames_filtered <- counts_thames %>%
  filter(METHOD == "Seine Net")

# 2) Per-survey CPUE (fish per run)
#    - If NO_OF_RUNS can vary within a survey, switch `first()` to `max()` and sanity-check.
seine_cpue <- counts_thames_filtered %>%
  filter(!is.na(FISH_COUNT), NO_OF_RUNS > 0) %>%
  group_by(SURVEY_ID, EVENT_DATE_YEAR) %>%
  summarise(
    total_fish = sum(FISH_COUNT, na.rm = TRUE),
    runs       = dplyr::first(NO_OF_RUNS),   # or max(NO_OF_RUNS)
    n_checks   = n_distinct(NO_OF_RUNS),     # for sanity checking
    .groups    = "drop"
  ) %>%
  mutate(CPUE = total_fish / runs)

# Optional sanity check:
# stopifnot(all(seine_cpue$n_checks == 1))

# 3) Yearly, survey-balanced summary (each survey = 1 vote)
cpue_year <- seine_cpue %>%
  group_by(EVENT_DATE_YEAR) %>%
  summarise(
    mean_CPUE = mean(CPUE, na.rm = TRUE),
    sd_CPUE   = sd(CPUE, na.rm = TRUE),
    n_surveys = n(),
    se        = sd_CPUE / sqrt(n_surveys),
    .groups   = "drop"
  )

# 4) Quick overview plot (mean ± SE)
p1 <- ggplot(cpue_year, aes(x = EVENT_DATE_YEAR, y = mean_CPUE)) +
  geom_ribbon(aes(ymin = mean_CPUE - se, ymax = mean_CPUE + se),
              alpha = 0.2, fill = "steelblue") +
  geom_line(colour = "steelblue", linewidth = 1) +
  geom_point(colour = "steelblue", size = 2) +
  labs(
    title = "Mean seine net CPUE per year – Thames Estuary (survey-balanced)",
    x = "Year",
    y = "Mean CPUE (fish per run)"
  ) +
  theme_minimal()

# 5) Trend model on yearly means
lm1 <- lm(mean_CPUE ~ EVENT_DATE_YEAR, data = cpue_year)

cpue_year$pred    <- predict(lm1)
cpue_year$pred_se <- predict(lm1, se.fit = TRUE)$se.fit

s      <- summary(lm1)
slope  <- coef(s)["EVENT_DATE_YEAR","Estimate"]
pval   <- coef(s)["EVENT_DATE_YEAR","Pr(>|t|)"]
lab    <- sprintf("LM slope = %.2f fish/run/yr (p = %.3f)", slope, pval)

# 6) Trend plot with fitted mean and 95% CI of the fit
# The ribbon is the Wald 95% CI of the fitted line (pred ± 1.96 * SE of fit),
# computed from the columns attached to cpue_year above.
p2 <- ggplot(cpue_year, aes(EVENT_DATE_YEAR, mean_CPUE)) +
  geom_point(size = 2) +
  geom_line(col = "grey40") +
  geom_ribbon(aes(ymin = pred - 1.96 * pred_se, ymax = pred + 1.96 * pred_se),
              fill = "lightblue", alpha = 0.35) +
  geom_line(aes(y = pred), col = "blue", linewidth = 1) +
  labs(
    title = "Trend in mean seine net CPUE – Thames Estuary (survey-balanced)",
    x = "Year", y = "Mean CPUE (fish per run)"
  ) +
  # Slope/p annotation pinned near the top-left of the panel.
  annotate("text",
           x = min(cpue_year$EVENT_DATE_YEAR, na.rm = TRUE) + 1,
           y = max(cpue_year$mean_CPUE, na.rm = TRUE),
           hjust = 0, vjust = 1, size = 3.5, label = lab) +
  theme_minimal()

# Render both plots.
p1

p2

# --- per-survey CPUE per species ---
# Same effort convention as the all-species CPUE: one NO_OF_RUNS value is
# taken per (survey, species) group via first().
species_svy_cpue <- counts_thames_filtered %>%
  filter(NO_OF_RUNS > 0) %>%
  group_by(SURVEY_ID, EVENT_DATE_YEAR, SPECIES_NAME) %>%
  summarise(
    count = sum(FISH_COUNT, na.rm = TRUE),
    runs  = dplyr::first(NO_OF_RUNS),
    .groups = "drop"
  ) %>%
  mutate(CPUE = count / runs)

# --- yearly, survey-balanced CPUE per species ---
# Each survey contributes equally to the yearly mean for a species.
species_trends <- species_svy_cpue %>%
  group_by(EVENT_DATE_YEAR, SPECIES_NAME) %>%
  summarise(
    mean_CPUE = mean(CPUE, na.rm = TRUE),
    n_surveys = n(),
    .groups = "drop"
  )

# --- per-species trend (weights = n_surveys) ---
# --- per-species trend (weights = n_surveys) ---
# Fit a survey-count-weighted linear trend of yearly mean CPUE on year for
# one species. Returns a one-row tibble (Slope, Wald 95% CI, P_value,
# Significance stars); an all-NA row when the data cannot support a fit.
species_model_summary <- function(df) {
  sd_cpue <- sd(df$mean_CPUE, na.rm = TRUE)
  # Guard: need >= 3 distinct years and a non-degenerate response.
  # sd() can return NA (e.g. all-NA response); the original `sd(...) == 0`
  # would then make `if (NA)` throw — treat NA as unfittable instead.
  if (dplyr::n_distinct(df$EVENT_DATE_YEAR) < 3 || is.na(sd_cpue) || sd_cpue == 0) {
    return(tibble(Slope = NA_real_, CI_low = NA_real_, CI_high = NA_real_,
                  P_value = NA_real_, Significance = NA_character_))
  }
  # Years backed by more surveys get proportionally more weight.
  m  <- lm(mean_CPUE ~ EVENT_DATE_YEAR, data = df, weights = n_surveys)
  co <- summary(m)$coefficients
  slope <- co["EVENT_DATE_YEAR", "Estimate"]
  se    <- co["EVENT_DATE_YEAR", "Std. Error"]
  p     <- co["EVENT_DATE_YEAR", "Pr(>|t|)"]
  # Scalar branching: plain if/else instead of case_when on a length-1 p.
  # An NA p falls through to "" to match the original case_when behaviour.
  sig <- if (is.na(p)) "" else if (p <= 0.001) "***" else
    if (p <= 0.01) "**" else if (p <= 0.05) "*" else ""
  tibble(
    Slope   = slope,
    CI_low  = slope - 1.96 * se,   # normal-approximation 95% CI
    CI_high = slope + 1.96 * se,
    P_value = p,
    Significance = sig
  )
}

# Apply the per-species trend model to every species. SigCat keeps all four
# significance levels (incl. "ns") so legends are consistent across plots.
species_results <- species_trends %>%
  group_by(SPECIES_NAME) %>%
  group_modify(~ species_model_summary(.x)) %>%
  ungroup() %>%
  mutate(
    SigCat = factor(Significance,
                    levels = c("***","**","*",""),
                    labels = c("***","**","*","ns"))
  )

# Significant trends only (p <= 0.05); the "ns" level is omitted from the
# factor because it cannot occur after the filter.
species_results_sig <- species_results %>%
  filter(P_value <= 0.05) %>%
  mutate(
    SigCat = factor(Significance,
                    levels = c("***","**","*"),
                    labels = c("***","**","*"))
  )

# --- Define colour palette for significance ---
sig_cols <- c(
  "***" = "#3b82f6",  # blue
  "**"  = "#22c55e",  # green
  "*"   = "#ef4444"   # red
)

# --- Plot only significant species ---
# Forest-style plot: slope ± 95% CI per species, ordered by slope.
# NOTE(review): geom_errorbarh() is deprecated in recent ggplot2 in favour of
# geom_errorbar(orientation = "y"); it still renders — flagged, not changed.
ggplot(species_results_sig, aes(x = Slope, y = reorder(SPECIES_NAME, Slope))) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray70") +
  geom_errorbarh(aes(xmin = CI_low, xmax = CI_high), height = 0.3, color = "gray70") +
  geom_point(aes(color = SigCat), size = 3) +
  scale_color_manual(values = sig_cols, drop = FALSE, name = "Significance",
                     breaks = c("***","**","*")) +
  labs(
    title = "Significant species-level trends in abundance (CPUE) – Thames Estuary",
    subtitle = "Only species with p ≤ 0.05 shown. Slope ± 95% CI (weighted by number of surveys).",
    x = "Slope (Δ CPUE per year)", 
    y = "Species", 
    caption = "Significance: * p ≤ 0.05 | ** p ≤ 0.01 | *** p ≤ 0.001"
  ) +
  theme_minimal() +
  theme(
    legend.position = "top",
    legend.direction = "horizontal",
    legend.title = element_text(size = 11),
    legend.text = element_text(size = 10)
  )

# Per-survey species composition: total count per species within each survey.
# .groups = "drop_last" leaves the result grouped by (year, survey), so the
# next summarise() collapses over species within each survey.
svy_comp <- counts_thames_filtered %>%
  group_by(EVENT_DATE_YEAR, SURVEY_ID, SPECIES_NAME) %>%
  summarise(n = sum(FISH_COUNT, na.rm = TRUE), .groups = "drop_last")

# Per-survey diversity metrics (one row per survey).
div_svy <- svy_comp %>%
  summarise(
    Species_Richness  = n_distinct(SPECIES_NAME),
    Total_Abundance   = sum(n),
    Shannon_Diversity = vegan::diversity(n, index = "shannon"),
    Simpson_Diversity = vegan::diversity(n, index = "simpson"),
    .groups = "drop"
  ) %>%
  mutate(
    # pmax() guards log(0). NOTE(review): a survey with a single individual
    # of a single species still yields 0/0 = NaN for Margalef — confirm OK.
    Margalef_Richness = (Species_Richness - 1) / log(pmax(Total_Abundance, 1)),
    # pmax(., 2) avoids dividing by log(1) = 0 for single-species surveys.
    Pielou_Evenness   = Shannon_Diversity / log(pmax(Species_Richness, 2))
  )

# Yearly survey-balanced means (and SEs) of each diversity metric.
diversity_summary <- div_svy %>%
  group_by(EVENT_DATE_YEAR) %>%
  summarise(
    Richness  = mean(Species_Richness),  se_Richness  = sd(Species_Richness)/sqrt(n()),
    Shannon   = mean(Shannon_Diversity), se_Shannon   = sd(Shannon_Diversity)/sqrt(n()),
    Simpson   = mean(Simpson_Diversity), se_Simpson   = sd(Simpson_Diversity)/sqrt(n()),
    Margalef  = mean(Margalef_Richness), se_Margalef  = sd(Margalef_Richness)/sqrt(n()),
    Pielou    = mean(Pielou_Evenness),   se_Pielou    = sd(Pielou_Evenness)/sqrt(n()),
    n_surveys      = n(),
    .groups = "drop"
  )

# Long format (metric, value) for faceted plotting; SE columns are dropped.
diversity_long <- diversity_summary %>%
  select(EVENT_DATE_YEAR,
         Richness, Shannon, Simpson, Margalef, Pielou) %>%
  pivot_longer(cols = -EVENT_DATE_YEAR,
               names_to = "Metric", values_to = "Value")

# One facet per diversity metric; red line = yearly means, black = LOESS fit.
ggplot(diversity_long, aes(x = EVENT_DATE_YEAR, y = Value)) +
  geom_line(color = "red", linewidth = 1) +
  geom_smooth(method = "loess", se = TRUE, color = "black", linewidth = 1) +
  facet_wrap(~ Metric, scales = "free_y") +
  labs(
    title = "Diversity metrics over time – Thames Estuary (survey-balanced)",
    subtitle = "Per-survey diversity averaged per year (LOESS trend)",
    x = "Year", y = "Diversity metric (mean)"
  ) +
  theme_minimal()

# Linear trend of each yearly (survey-balanced) diversity metric on year.
lm_richness <- lm(Richness  ~ EVENT_DATE_YEAR, data = diversity_summary)
lm_shannon  <- lm(Shannon   ~ EVENT_DATE_YEAR, data = diversity_summary)
lm_simpson  <- lm(Simpson   ~ EVENT_DATE_YEAR, data = diversity_summary)
lm_margalef <- lm(Margalef  ~ EVENT_DATE_YEAR, data = diversity_summary)
lm_evenness <- lm(Pielou    ~ EVENT_DATE_YEAR, data = diversity_summary)

# Build a one-row data.frame (Slope, SE, 95% CI, P_value, stars) for the
# EVENT_DATE_YEAR term of a fitted model. The p-value column is "Pr(>|t|)"
# for lm-type fits and "Pr(>|z|)" for z-based ones, so both are looked up.
summary_table <- function(model, metric_name) {
  coef_summary <- summary(model)$coefficients
  slope <- coef_summary["EVENT_DATE_YEAR", "Estimate"]
  se <- coef_summary["EVENT_DATE_YEAR", "Std. Error"]
  p_col <- intersect(c("Pr(>|t|)", "Pr(>|z|)"), colnames(coef_summary))
  if (length(p_col) == 0) {
    # Fail with a clear message instead of an opaque NA-subscript error.
    stop("No p-value column found in coefficient table for ", metric_name,
         call. = FALSE)
  }
  p <- coef_summary["EVENT_DATE_YEAR", p_col[1]]
  ci_low <- slope - 1.96 * se
  ci_high <- slope + 1.96 * se
  # Scalar condition: plain if/else chain instead of dplyr::case_when,
  # removing the dplyr dependency from this helper. NA p maps to "" as the
  # original case_when fallthrough did.
  sig <- if (is.na(p)) "" else if (p <= 0.001) "***" else
    if (p <= 0.01) "**" else if (p <= 0.05) "*" else ""
  data.frame(Metric = metric_name, Slope = slope, SE = se,
             CI_low = ci_low, CI_high = ci_high,
             P_value = round(p, 4), Significance = sig)
}

# Collect per-metric trend rows into one table.
results <- bind_rows(
  summary_table(lm_richness,  "Species richness"),
  summary_table(lm_shannon,   "Shannon diversity"),
  summary_table(lm_simpson,   "Simpson diversity"),
  summary_table(lm_margalef,  "Margalef richness"),
  summary_table(lm_evenness,  "Pielou evenness")
)

# Factor version of the stars (with explicit "ns" level) for plotting.
results_lab <- results %>%
  mutate(
    SigCat = factor(Significance,
                    levels = c("***","**","*",""),
                    labels = c("***","**","*","ns")),
    Metric_lab = Metric   # <- keep metric names clean, no stars
  )

# Palette reused across the significance plots (here including "ns").
sig_cols <- c(
  "***" = "#3b82f6",  # blue
  "**"  = "#22c55e",  # green
  "*"   = "#ef4444",  # red
  "ns"  = "#9ca3af"   # grey
)

# Forest plot of diversity-metric slopes (± 95% CI), ordered by slope.
ggplot(results_lab, aes(x = Slope, y = reorder(Metric_lab, Slope))) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray70") +
  geom_errorbarh(aes(xmin = CI_low, xmax = CI_high), height = 0.22, color = "gray70") +
  geom_point(aes(color = SigCat), size = 3) +
  scale_color_manual(values = sig_cols, drop = FALSE, name = "Significance",
                     breaks = c("***","**","*","ns")) +
  labs(
    title = "Trend in diversity metrics over time – Thames Estuary",
    subtitle = "Slope ± 95% CI from linear models on yearly survey-balanced means",
    x = "Slope (change per year)", y = "Diversity metric (mean)",
    caption = "Significance: * p ≤ 0.05 | ** p ≤ 0.01 | *** p ≤ 0.001"
  ) +
  theme_minimal() +
  theme(legend.position = "top",
        legend.direction = "horizontal",
        legend.title = element_text(size = 11),
        legend.text  = element_text(size = 10))

# Presence/absence of each species in each survey.
pa_svy <- counts_thames_filtered %>%
  filter(!is.na(SPECIES_NAME), !is.na(FISH_COUNT)) %>%
  mutate(PA = as.integer(FISH_COUNT > 0)) %>%
  group_by(EVENT_DATE_YEAR, SURVEY_ID, SPECIES_NAME) %>%
  summarise(present = as.integer(any(PA == 1)), .groups = "drop")

# Sampling effort per year (number of distinct surveys).
svy_per_year <- pa_svy %>%
  distinct(EVENT_DATE_YEAR, SURVEY_ID) %>%
  count(EVENT_DATE_YEAR, name = "n_surveys")

# Rarefaction level: every year is subsampled down to the smallest yearly
# survey count so effort is balanced across years.
min_svy <- min(svy_per_year$n_surveys)

# Fixed seed makes the random subsample reproducible.
set.seed(42)
sampled_surveys <- pa_svy %>%
  distinct(EVENT_DATE_YEAR, SURVEY_ID) %>%
  group_by(EVENT_DATE_YEAR) %>%
  slice_sample(n = min_svy) %>%
  ungroup()

# Year x species presence/absence matrix from the rarefied survey set
# (rows = years, columns = species, cells 0/1).
year_species_pa <- pa_svy %>%
  semi_join(sampled_surveys, by = c("EVENT_DATE_YEAR","SURVEY_ID")) %>%
  group_by(EVENT_DATE_YEAR, SPECIES_NAME) %>%
  summarise(present = as.integer(any(present == 1)), .groups = "drop") %>%
  tidyr::pivot_wider(names_from = SPECIES_NAME, values_from = present, values_fill = 0) %>%
  tibble::column_to_rownames("EVENT_DATE_YEAR") %>%
  as.data.frame()

# Pairwise beta diversity (Sorensen family): total (sor), turnover (sim),
# nestedness (sne) components.
beta_core <- betapart::betapart.core(year_species_pa)
beta_res  <- betapart::beta.pair(beta_core, index.family = "sor")

sor_mat <- as.matrix(beta_res$beta.sor)
sim_mat <- as.matrix(beta_res$beta.sim)
sne_mat <- as.matrix(beta_res$beta.sne)

# Row names are the years (set via column_to_rownames above).
years <- sort(as.numeric(rownames(sor_mat)))

# Dissimilarity between each pair of consecutive years.
beta_trends <- tibble::tibble(
  Year1 = years[-length(years)],
  Year2 = years[-1],
  beta_sor = purrr::map2_dbl(Year1, Year2, ~ sor_mat[as.character(.x), as.character(.y)]),
  beta_sim = purrr::map2_dbl(Year1, Year2, ~ sim_mat[as.character(.x), as.character(.y)]),
  beta_sne = purrr::map2_dbl(Year1, Year2, ~ sne_mat[as.character(.x), as.character(.y)])
)

# Linear trend of each beta component against the later year of each pair.
lm_sor <- lm(beta_sor ~ Year2, data = beta_trends)
lm_sim <- lm(beta_sim ~ Year2, data = beta_trends)
lm_sne <- lm(beta_sne ~ Year2, data = beta_trends)

# Extract rounded R^2 and slope p-value from a simple lm fit.
# The p-value is looked up by column name ("Pr(>|t|)") rather than by the
# fragile positional index [2, 4], so the lookup survives layout changes
# in the coefficient table; row 2 is the single slope term of these fits.
get_lm_stats <- function(model) {
  sm <- summary(model)
  list(r2 = round(sm$r.squared, 3),
       p  = round(sm$coefficients[2, "Pr(>|t|)"], 3))
}

# R^2 / p for each beta-component trend, used in the plot subtitle.
sor_stats <- get_lm_stats(lm_sor)
sim_stats <- get_lm_stats(lm_sim)
sne_stats <- get_lm_stats(lm_sne)

# Long format with human-readable component labels for plotting.
beta_trends_long <- beta_trends %>%
  tidyr::pivot_longer(c(beta_sor, beta_sim, beta_sne),
                      names_to = "Component", values_to = "Dissimilarity") %>%
  dplyr::mutate(Component = dplyr::recode(Component,
                                          beta_sor = "Sorensen", beta_sim = "Turnover", beta_sne = "Nestedness"))

subtitle_text <- paste0(
  "Sorensen: R² = ", sor_stats$r2, ", p = ", sor_stats$p, "   |   ",
  "Turnover: R² = ", sim_stats$r2, ", p = ", sim_stats$p, "   |   ",
  "Nestedness: R² = ", sne_stats$r2, ", p = ", sne_stats$p)


# Year-to-year dissimilarity for each beta component.
# `linewidth` replaces the deprecated `size` aesthetic for lines
# (ggplot2 >= 3.4), matching the usage elsewhere in this script.
ggplot(beta_trends_long, aes(x = Year2, y = Dissimilarity, color = Component)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  scale_color_manual(values = c("Sorensen" = "darkorchid",
                                "Turnover" = "steelblue",
                                "Nestedness" = "darkorange")) +
  labs(
    title = "Year-to-year change in beta diversity components – Thames Estuary (survey-balanced)",
    subtitle = subtitle_text,
    x = "Year", y = "Dissimilarity", color = "Component"
  ) +
  theme_minimal()

# --- Setup & load ---
# NOTE(review): setwd() ties the script to one machine; consider relative
# paths or here::here() instead.
setwd("~/PhD/London NERC DTP documents/EA data/EnvironmentAgency_EcologyFish_DataDownload_2025-04-02")

lengths <- read_csv("TraC_Fish_Individual_Lengths_2025-10-10.csv")
sites   <- read_csv("TraC_Fish_Sites_2025-10-10.csv")

# Join + tidy
# Attach site metadata to each individual length record, keep the columns of
# interest, parse the date, and infer gear type from keywords in SITE_NAME
# (same rules as for the count data above).
lengths_clean <- lengths %>%
  left_join(sites, by = "SITE_ID") %>%
  select(
    SITE_ID,
    SITE_NAME = SITE_NAME.x,
    SITE_PARENT_NAME,
    EVENT_DATE,
    SURVEY_ID,
    SPECIES_NAME,
    LATIN_NAME,
    FISH_LENGTH,
    TOP_TIER_SITE,
    SITE_RANKED_NGR,
    SITE_RANKED_EASTING,
    SITE_RANKED_NORTHING
  ) %>%
  mutate(
    EVENT_DATE = ymd(EVENT_DATE),
    EVENT_YEAR = year(EVENT_DATE),
    EVENT_MONTH = month(EVENT_DATE),
    EVENT_DAY = day(EVENT_DATE),
    METHOD = case_when(
      str_detect(str_to_lower(SITE_NAME), "seine") ~ "Seine Net",
      str_detect(str_to_lower(SITE_NAME), "beam")  ~ "Beam Trawl",
      str_detect(str_to_lower(SITE_NAME), "otter") ~ "Otter Trawl",
      str_detect(str_to_lower(SITE_NAME), "kick")  ~ "Kick Sample",
      str_detect(str_to_lower(SITE_NAME), "fyke")  ~ "Fyke Net",
      str_detect(str_to_lower(SITE_NAME), "push")  ~ "Push Net",
      str_detect(str_to_lower(SITE_NAME), "hand")  ~ "Hand Collection",
      str_detect(str_to_lower(SITE_NAME), "gill")  ~ "Gill Net",
      TRUE ~ "Unknown"
    )
  )

# Thames + Seine only
lengths_thames_filtered <- lengths_clean %>%
  filter(str_detect(TOP_TIER_SITE, regex("thames", ignore_case = TRUE)),
         METHOD == "Seine Net",
         !is.na(FISH_LENGTH), FISH_LENGTH > 0)


# Restrict to the seven core monitoring sites.
lengths_thames_filtered2 <- lengths_thames_filtered %>%
  filter(SITE_PARENT_NAME %in% c(
    "Richmond",
    "Kew",
    "Battersea",
    "Greenwich",
    "West Thurrock",
    "Denton Wharf",
    "Stanford-le-Hope Beach"
  ))

# (Optional) Species-wise IQR outlier removal
# NOTE(review): this overwrites lengths_thames_filtered with the site-filtered,
# outlier-trimmed data; the helper columns Q1..hi remain on the result.
lengths_thames_filtered <- lengths_thames_filtered2 %>%
  group_by(LATIN_NAME) %>%
  mutate(
    Q1 = quantile(FISH_LENGTH, 0.25, na.rm = TRUE),
    Q3 = quantile(FISH_LENGTH, 0.75, na.rm = TRUE),
    IQRv = Q3 - Q1,
    lo = Q1 - 1.5 * IQRv,   # Tukey fences: 1.5 x IQR beyond the quartiles
    hi = Q3 + 1.5 * IQRv
  ) %>%
  ungroup() %>%
  filter(FISH_LENGTH >= lo, FISH_LENGTH <= hi)


# Per-survey summaries (one row per survey)
survey_means <- lengths_thames_filtered %>%
  group_by(EVENT_YEAR, SURVEY_ID) %>%
  summarise(
    mean_len_survey   = mean(FISH_LENGTH),
    median_len_survey = median(FISH_LENGTH),
    n_fish            = n(),
    .groups = "drop"
  )

# Yearly (survey-balanced) summaries
# Averaging the per-survey means gives every survey equal weight, so big
# catches do not dominate the yearly value.
survey_bal <- survey_means %>%
  group_by(EVENT_YEAR) %>%
  summarise(
    mean_length   = mean(mean_len_survey),      # equal weight per survey
    median_length = median(mean_len_survey),
    sd_mean       = sd(mean_len_survey),
    n_surveys     = n(),
    se_mean       = sd_mean / sqrt(n_surveys),
    .groups = "drop"
  ) %>%
  arrange(EVENT_YEAR)

# Linear trends on survey-balanced means
lm_mean   <- lm(mean_length   ~ EVENT_YEAR, data = survey_bal)
lm_median <- lm(median_length ~ EVENT_YEAR, data = survey_bal)

# Pull the EVENT_YEAR slope and its p-value from a fitted lm as a plain
# list, with names stripped for clean use in sprintf().
get_coef <- function(mod) {
  coefs <- summary(mod)$coefficients
  list(
    slope = unname(coefs["EVENT_YEAR", "Estimate"]),
    p     = unname(coefs["EVENT_YEAR", "Pr(>|t|)"])
  )
}
# Annotation labels built from each model's slope and p-value.
lab_mean   <- with(get_coef(lm_mean),   sprintf("Mean: slope = %.2f mm/yr (p = %.3g)",   slope, p))
lab_median <- with(get_coef(lm_median), sprintf("Median: slope = %.2f mm/yr (p = %.3g)", slope, p))

# Plot with top annotations
# y = Inf with vjust pins the two labels to the top of the panel.
x_left <- min(survey_bal$EVENT_YEAR, na.rm = TRUE)
ggplot(survey_bal, aes(EVENT_YEAR)) +
  geom_ribbon(aes(ymin = mean_length - se_mean, ymax = mean_length + se_mean),
              alpha = 0.15, fill = "steelblue") +
  geom_line(aes(y = mean_length, color = "Mean"), linewidth = 1.2) +
  geom_point(aes(y = mean_length, color = "Mean"), size = 2) +
  geom_smooth(aes(y = mean_length, color = "Mean"), method = "lm", se = FALSE, linewidth = 0.9) +
  geom_line(aes(y = median_length, color = "Median"), linewidth = 1.2, linetype = "dashed") +
  geom_point(aes(y = median_length, color = "Median"), size = 2, shape = 17) +
  geom_smooth(aes(y = median_length, color = "Median"), method = "lm", se = FALSE, linetype = "dashed", linewidth = 0.9) +
  annotate("text", x = x_left, y = Inf, hjust = 0, vjust = 1.4, label = lab_mean,   color = "steelblue",  size = 4) +
  annotate("text", x = x_left, y = Inf, hjust = 0, vjust = 2.8, label = lab_median, color = "darkorange", size = 4) +
  # Extra headroom (18% top expansion) so the annotations don't overlap data.
  scale_y_continuous(expand = expansion(mult = c(0.02, 0.18))) +
  scale_color_manual(values = c("Mean" = "steelblue", "Median" = "darkorange")) +
  labs(title = "Survey-balanced fish length trends – Thames Estuary",
       subtitle = "Mean ± SE (solid) and median (dashed) per year with linear trends",
       x = "Year", y = "Fish length (mm)", color = "Statistic") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "top", panel.grid.minor = element_blank())

# Thames + Seine only
# NOTE(review): this repeats the filtering block above and re-creates
# lengths_thames_filtered / lengths_thames_filtered2 (without the IQR trim).
lengths_thames_filtered <- lengths_clean %>%
  filter(str_detect(TOP_TIER_SITE, regex("thames", ignore_case = TRUE)),
         METHOD == "Seine Net",
         !is.na(FISH_LENGTH), FISH_LENGTH > 0)


lengths_thames_filtered2 <- lengths_thames_filtered %>%
  filter(SITE_PARENT_NAME %in% c(
    "Richmond",
    "Kew",
    "Battersea",
    "Greenwich",
    "West Thurrock",
    "Denton Wharf",
    "Stanford-le-Hope Beach"
  ))

# Modelling data: tidy species names, drop incomplete rows, and centre year
# (Yc) to reduce intercept/slope correlation in the models below.
len_dat <- lengths_thames_filtered2 %>%
  mutate(LATIN_NAME = str_squish(LATIN_NAME)) %>%
  filter(!is.na(FISH_LENGTH), FISH_LENGTH > 0,
         !is.na(EVENT_YEAR), !is.na(SITE_ID)) %>%
  mutate(Yc = EVENT_YEAR - mean(EVENT_YEAR, na.rm = TRUE))

# keep species with enough temporal info overall
# (at least 3 distinct years and 30 measured fish)
keep <- len_dat %>%
  group_by(LATIN_NAME) %>%
  summarise(n_years = n_distinct(EVENT_YEAR), n = n(), .groups = "drop") %>%
  filter(n_years >= 3, n >= 30) %>%
  pull(LATIN_NAME)

dat_keep <- len_dat %>% filter(LATIN_NAME %in% keep)

# Fit a per-species body-length trend.
# Uses a random-intercept LMM (FISH_LENGTH ~ Yc + (1|SITE_ID)) when at least
# one site has >= 2 observations and sites don't saturate the data; otherwise
# falls back to a plain LM. Returns a one-row tibble (slope, SE, p, R^2s,
# sample-size diagnostics) or NULL when the fit itself errors.
fit_one <- function(df) {
  n_obs   <- nrow(df)
  n_sites <- dplyr::n_distinct(df$SITE_ID)
  max_rep <- if (n_obs == 0) 0 else df %>% count(SITE_ID, name = "n") %>% pull(n) %>% max(na.rm = TRUE)
  use_lmm <- (max_rep >= 2) && (n_sites < n_obs)

  if (use_lmm) {
    fit <- try(suppressWarnings(lmer(FISH_LENGTH ~ Yc + (1|SITE_ID), data = df, REML = TRUE)), silent = TRUE)
    if (inherits(fit, "try-error")) return(NULL)

    s  <- summary(fit)
    co <- coef(s)

    # Fixed-effect slope for (centred) year and its SE.
    beta <- unname(co["Yc","Estimate"])
    se   <- unname(co["Yc","Std. Error"])

    # lme4's summary has no p-value column unless lmerTest is attached.
    # Take it directly when present; otherwise fall back to a normal
    # approximation on the t/z statistic. Guard both lookups so an empty
    # intersect can no longer produce a zero-length p (original bug when
    # neither a p column nor a recognised t/z column existed).
    pcol <- intersect(colnames(co), c("Pr(>|t|)", "Pr(>|z|)"))
    if (length(pcol) >= 1) {
      pval <- unname(co["Yc", pcol[1]])
    } else {
      tcol <- intersect(colnames(co), c("t value","z value","t.value","z.value"))
      if (length(tcol) >= 1) {
        tval <- unname(co["Yc", tcol[1]])
        # Normal approx fallback (conservative would use Satterthwaite via lmerTest)
        pval <- 2 * pnorm(abs(tval), lower.tail = FALSE)
      } else {
        pval <- NA_real_
      }
    }

    # Marginal/conditional R^2 (Nakagawa); left NA if the helper fails.
    r2_m <- r2_c <- NA_real_
    r2 <- try(suppressWarnings(suppressMessages(performance::r2_nakagawa(fit))), silent = TRUE)
    if (!inherits(r2, "try-error")) { r2_m <- r2$R2_marginal; r2_c <- r2$R2_conditional }

    return(tibble(
      model = "LMM (1|SITE)",
      Slope = beta,
      SE    = se,
      p     = pval,
      r2_m  = r2_m,
      r2_c  = r2_c,
      n_obs = n_obs, n_sites = n_sites, max_rep = max_rep
    ))
  }

  # LM fallback: no usable site replication, so site structure is ignored.
  fit <- try(suppressWarnings(lm(FISH_LENGTH ~ Yc, data = df)), silent = TRUE)
  if (inherits(fit, "try-error")) return(NULL)
  s  <- summary(fit); co <- coef(s)

  tibble(
    model = "LM",
    Slope = unname(co["Yc","Estimate"]),
    SE    = unname(co["Yc","Std. Error"]),
    p     = unname(co["Yc","Pr(>|t|)"]),
    r2_m  = s$r.squared,
    r2_c  = NA_real_,
    n_obs = nrow(df),
    n_sites = dplyr::n_distinct(df$SITE_ID),
    max_rep = if (nrow(df) > 0) df %>% count(SITE_ID, name = "n") %>% pull(n) %>% max(na.rm = TRUE) else NA_real_
  )
}


# Wrapper around fit_one() that never errors: any failure (or NULL result)
# is converted into a one-row all-NA tibble carrying the sample-size
# diagnostics, so purrr::map_dfr over species cannot abort mid-run.
safe_fit_one <- function(df) {
  res <- try(fit_one(df), silent = TRUE)
  if (!inherits(res, "try-error") && !is.null(res)) {
    return(res)
  }
  # Fallback row: NA estimates, real counts.
  rep_max <- if (nrow(df) > 0) {
    df %>% count(SITE_ID, name = "n") %>% pull(n) %>% max(na.rm = TRUE)
  } else {
    NA_real_
  }
  tibble(
    model   = NA_character_,
    Slope   = NA_real_, SE = NA_real_, p = NA_real_,
    r2_m    = NA_real_, r2_c = NA_real_,
    n_obs   = nrow(df),
    n_sites = dplyr::n_distinct(df$SITE_ID),
    max_rep = rep_max
  )
}

# Fit one size-trend model per species and collect results into one table.
# group_split + map_dfr keeps the species identifiers alongside each fit.
species_size_trends <- dat_keep %>%
  group_by(LATIN_NAME, SPECIES_NAME) %>%
  group_split(.keep = TRUE) %>%
  purrr::map_dfr(function(df) {
    safe_fit_one(df) %>%
      mutate(
        LATIN_NAME   = df$LATIN_NAME[1],
        SPECIES_NAME = df$SPECIES_NAME[1]
      )
  }) %>%
  relocate(LATIN_NAME, SPECIES_NAME) %>%
  ungroup() %>%   # NOTE(review): no-op — map_dfr output is ungrouped
  mutate(
    Sig = dplyr::case_when(
      !is.na(p) & p <= 0.001 ~ "***",
      !is.na(p) & p <= 0.01  ~ "**",
      !is.na(p) & p <= 0.05  ~ "*",
      TRUE ~ ""
    )
  )

# Significant only, by salinity class
# NOTE(review): this assignment is dead — it is overwritten immediately below.
sig_trends <- species_size_trends %>% filter(p <= 0.05)


# --- Keep only significant species (p ≤ 0.05)
sig_trends <- species_size_trends %>%
  filter(p <= 0.05) %>%
  mutate(
    SigCat = factor(Sig,
                    levels = c("***", "**", "*"),
                    labels = c("***", "**", "*"))
  )

# --- Define the same significance colour palette ---
sig_cols <- c(
  "***" = "#3b82f6",  # blue
  "**"  = "#22c55e",  # green
  "*"   = "#ef4444"   # red
)

# --- Plot: consistent significance colours ---
# Forest plot of body-size slopes ± 1 SE (note: ±SE here, not a 95% CI);
# point shape distinguishes LMM fits from LM fallbacks.
ggplot(sig_trends, aes(x = Slope, y = reorder(SPECIES_NAME, Slope))) +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray70") +
  geom_errorbarh(aes(xmin = Slope - SE, xmax = Slope + SE),
                 height = 0.25, color = "gray60") +
  geom_point(aes(color = SigCat, shape = model), size = 3) +
  scale_color_manual(values = sig_cols, name = "Significance",
                     breaks = c("***", "**", "*")) +
  scale_shape_manual(values = c("LMM (1|SITE)" = 16, "LM" = 1),
                     name = "Model") +
  labs(
    title = "Significant species body-size trends – Thames Estuary",
    subtitle = "Slope ± SE (mm/year). LMM used when sites have replication; else LM fallback.",
    x = "Change in mean length (mm per year)", 
    y = "Species",
    caption = "Significance: * p ≤ 0.05 | ** p ≤ 0.01 | *** p ≤ 0.001"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    legend.position = "top",
    legend.box = "horizontal",
    legend.title = element_text(size = 11),
    legend.text  = element_text(size = 10)
  )

# Thames seine-net length records with complete grouping fields.
dat_len <- lengths_clean %>%
  filter(
    str_detect(TOP_TIER_SITE, regex("thames", ignore_case = TRUE)),
    METHOD == "Seine Net",
    !is.na(FISH_LENGTH), FISH_LENGTH > 0,
    !is.na(EVENT_YEAR), !is.na(SURVEY_ID),
    !is.na(SITE_PARENT_NAME), !is.na(SPECIES_NAME)
  )

# Restrict to the seven core monitoring sites.
dat_len2 <- dat_len %>%
  filter(SITE_PARENT_NAME %in% c(
    "Richmond",
    "Kew",
    "Battersea",
    "Greenwich",
    "West Thurrock",
    "Denton Wharf",
    "Stanford-le-Hope Beach"
  ))

# Species-specific length thresholds from the quartiles of each species'
# length distribution (<= Q1 -> "juvenile", >= Q3 -> "adult"); species with
# fewer than 30 measurements are excluded.
thr <- dat_len2 %>%
  group_by(SPECIES_NAME) %>%
  summarise(
    n_len  = n(),
    L_juv  = quantile(FISH_LENGTH, 0.25, na.rm = TRUE),
    L_adlt = quantile(FISH_LENGTH, 0.75, na.rm = TRUE),
    .groups = "drop"
  ) %>% filter(n_len >= 30)

# Flag each fish as juvenile/adult relative to its species thresholds.
# NOTE(review): thresholds come from dat_len2 (seven named sites) but are
# joined onto the unrestricted dat_len — confirm this is intentional.
flagged <- dat_len %>%
  inner_join(thr, by = "SPECIES_NAME") %>%
  mutate(
    is_juv   = FISH_LENGTH <= L_juv,
    is_adult = FISH_LENGTH >= L_adlt
  )

# Per-survey proportion of juvenile and adult fish among measured fish.
sb_svy <- flagged %>%
  group_by(SURVEY_ID, EVENT_YEAR) %>%
  summarise(
    prop_juv   = mean(is_juv,   na.rm = TRUE),
    prop_adult = mean(is_adult, na.rm = TRUE),
    .groups = "drop"
  )

# Yearly survey-balanced means (± SE) of the juvenile/adult proportions.
sb_year <- sb_svy %>%
  group_by(EVENT_YEAR) %>%
  summarise(
    mean_prop_juv   = mean(prop_juv),
    se_prop_juv     = sd(prop_juv)/sqrt(n()),
    mean_prop_adult = mean(prop_adult),
    se_prop_adult   = sd(prop_adult)/sqrt(n()),
    .groups = "drop"
  )

# Long format via the .value sentinel: "Juveniles_mean" splits into
# Group = "Juveniles" and a `mean` column, etc. Wald 95% CIs are clamped
# to [0, 1] since these are proportions.
sb_long <- sb_year %>%
  transmute(
    EVENT_YEAR,
    Juveniles_mean = mean_prop_juv,   Juveniles_se = se_prop_juv,
    Adults_mean    = mean_prop_adult, Adults_se    = se_prop_adult
  ) %>%
  pivot_longer(-EVENT_YEAR, names_to = c("Group",".value"), names_sep = "_") %>%
  mutate(
    lower = pmax(0, mean - 1.96*se),
    upper = pmin(1, mean + 1.96*se),
    Source = "Survey-balanced"
  )

# Binomial GLMMs for the probability a measured fish is juvenile/adult,
# with year (scaled) as fixed effect and random intercepts for site and
# species.
m_juv <- glmer(
  as.integer(is_juv) ~ scale(EVENT_YEAR) + (1|SITE_PARENT_NAME) + (1|SPECIES_NAME),
  data = flagged, family = binomial()
)
m_ad  <- glmer(
  as.integer(is_adult) ~ scale(EVENT_YEAR) + (1|SITE_PARENT_NAME) + (1|SPECIES_NAME),
  data = flagged, family = binomial()
)

yrs <- sort(unique(flagged$EVENT_YEAR))

# Model-based (LS-mean) yearly probabilities on the response scale.
emm_juv <- as.data.frame(emmeans(m_juv, ~ EVENT_YEAR, at = list(EVENT_YEAR = yrs), type = "response"))
emm_ad  <- as.data.frame(emmeans(m_ad,  ~ EVENT_YEAR, at = list(EVENT_YEAR = yrs), type = "response"))

# NOTE(review): prob_col is never used below — extract_emm() re-derives the
# estimate column itself; candidate for removal.
prob_col <- if ("prob" %in% names(emm_juv)) "prob" else "emmean"

# Normalise an emmeans summary into a tidy tibble of yearly estimates.
# Handles column-name variation across emmeans versions/model types: the
# estimate may be "prob", "emmean" or "response"; the CI bounds may be
# "asymp.LCL/UCL" or "lower.CL/upper.CL". Errors with the available column
# names when none match.
extract_emm <- function(emm_df, group_label) {
  nm <- names(emm_df)

  # estimate column can be "prob", "emmean", or "response"
  est_col   <- intersect(c("prob", "emmean", "response"), nm)
  if (length(est_col) == 0)
    stop("No estimate column found in emmeans output. Available cols: ",
         paste(nm, collapse = ", "))

  # CI columns can be "asymp.LCL/UCL" or "lower.CL/upper.CL"
  lower_col <- intersect(c("asymp.LCL", "lower.CL", "LCL"), nm)
  upper_col <- intersect(c("asymp.UCL", "upper.CL", "UCL"), nm)
  if (length(lower_col) == 0 || length(upper_col) == 0)
    stop("No CI columns found in emmeans output. Available cols: ",
         paste(nm, collapse = ", "))

  # Year column should be present as EVENT_YEAR (factor or numeric)
  year_col <- "EVENT_YEAR"
  if (!year_col %in% nm)
    stop("EVENT_YEAR not found in emmeans output. Available cols: ",
         paste(nm, collapse = ", "))

  # BUGFIX: as.integer() on a factor returns the level codes (1, 2, ...),
  # not the year values — convert through character first when the year
  # comes back as a factor.
  yr <- emm_df[[year_col]]
  if (is.factor(yr)) yr <- as.character(yr)

  tibble::tibble(
    EVENT_YEAR = as.integer(yr),
    mean  = as.numeric(emm_df[[est_col[1]]]),
    lower = as.numeric(emm_df[[lower_col[1]]]),
    upper = as.numeric(emm_df[[upper_col[1]]]),
    Group = group_label,
    Source = "LS-mean (GLMM)"
  )
}

# Tidy model-based estimates for each group, then stack with the
# survey-balanced series (matching columns) for a side-by-side plot.
juv_df <- extract_emm(as.data.frame(emm_juv), "Juveniles")
ad_df  <- extract_emm(as.data.frame(emm_ad),  "Adults")

lsm_long <- dplyr::bind_rows(juv_df, ad_df)

both <- bind_rows(sb_long %>% select(EVENT_YEAR, Group, mean, lower, upper, Source),
                  lsm_long)

# Shared scales for the two sources (line colour, ribbon fill, line type).
cols <- c("Survey-balanced" = "#1f77b4", "LS-mean (GLMM)" = "#e41a1c")
fills <- c("Survey-balanced" = "#9ecae1", "LS-mean (GLMM)" = "#fcbba1")
ltys  <- c("Survey-balanced" = "solid",   "LS-mean (GLMM)" = "solid")

# Compare raw survey-balanced proportions with the GLMM LS-means, one facet
# per life stage. `linewidth` replaces the deprecated `size` aesthetic for
# lines (ggplot2 >= 3.4), matching the usage elsewhere in this script.
ggplot(both, aes(EVENT_YEAR, mean, colour = Source, fill = Source, linetype = Source)) +
  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.20, colour = NA) +
  geom_line(linewidth = 1) +
  geom_point(size = 1.8) +
  facet_wrap(~ Group, ncol = 1, scales = "free_y") +
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  scale_colour_manual(values = cols) +
  scale_fill_manual(values   = fills) +
  scale_linetype_manual(values = ltys) +
  labs(
    title = "Juvenile & adult proportions: survey-balanced vs LS-mean (GLMM)",
    x = "Year", y = "Proportion of measured fish", colour = "", fill = "", linetype = ""
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "top", strip.text = element_text(face = "bold"))

Thames Fisheries data

Thames Fisheries Experiment - angling data